Imports
Load data
pycombat_batch_corrected_counts_path = "data/batch_corrected_data/pycombat_batch_corrected_counts.tsv"
metadata_path = "data/formatted_rnaseq_data/metadata.tsv"
pycombat_batch_corrected_dex_results_path = "data/differential_analysis/pycombat_batch_corrected_dex_results.tsv"
pycombat_batch_corrected_dex_selected_genes_path = "data/differential_analysis/pycombat_batch_corrected_dex_selected_genes.tsv"
pycombat_batch_corrected_counts = as.data.frame(read.table(
pycombat_batch_corrected_counts_path,
sep = "\t",
header = TRUE,
row.names = 1
))
print(pycombat_batch_corrected_counts)
metadata = as.data.frame(read.table(
metadata_path,
sep = "\t",
header = TRUE,
row.names = 1
))
print(metadata)
DESeq
Setup dds
design = ~ type + dataset_name
dds = DESeqDataSetFromMatrix(
countData = pycombat_batch_corrected_counts,
colData = metadata,
design = design)
Warning: some variables in design formula are characters, converting to factors
Run deseq
dds <- DESeq(dds, test="LRT", reduced = ~ dataset_name)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 2239 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
save(dds, file = paste0("data/differential_analysis/pycombat_batch_corrected_counts_dds.RData"))
Examine results
res = as.data.frame(results(dds))
# Format results
res = as.data.frame(res)
res = res %>%
relocate(padj) %>%
relocate(pvalue)
print(res)
res_path = paste0(pycombat_batch_corrected_dex_results_path)
write.table(res, file = res_path, sep = "\t")
res$abs_l2fc = abs(res$log2FoldChange)
print(res)
Select features
Visualize spread of l2fc values to determine a sensible cutoff for
feature selection. Note that l2fc does not have a direct meaning since
we’re doing an LRT test.
# Calculate the cumulative count
res$abs_l2fc = abs(res$log2FoldChange)
sorted_abs_l2fc <- sort(res$abs_l2fc)
cumulative_counts <- cumsum(table(sorted_abs_l2fc))
# Plot the cumulative counts
plot(names(cumulative_counts), cumulative_counts,
type="s", # Step plot
main="Cumulative Count Plot for L2FC of genes",
xlab="L2FC Value",
ylab="Cumulative Count",
col="blue")
cutoff = 2 # Chosen from looking at the plot
# Add a horizontal line at cutoff
abline(v = cutoff, col="red", lty=2)

selected_genes = res[res$abs_l2fc > cutoff,]
print(selected_genes)
selected_genes_path = paste0(pycombat_batch_corrected_dex_selected_genes_path)
write.table(selected_genes, file = selected_genes_path, sep = "\t")
# # Identify inflection point
# x_values <- as.numeric(names(cumulative_counts))
# y_values <- as.numeric(cumulative_counts)
#
# # Approximate first and second derivatives
# first_derivative <- diff(y_values)
# second_derivative <- diff(first_derivative)
#
# inflection_point_index <- which(diff(sign(second_derivative)) != 0) + 1
# inflection_point <- x_values[inflection_point_index]
#
# # Add the inflection point to the plot
# points(inflection_point, y_values[inflection_point_index], col="red", pch=19)
# abline(v = inflection_point, col="red", lty=2)
#
# # Print the inflection point
# print(paste("Inflection point at x =", inflection_point))
LS0tCnRpdGxlOiAiRGlmZmVyZW50aWFsIGFuYWx5c2lzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBJbXBvcnRzCgpgYGB7ciBsb2FkIHBhY2thZ2VzLCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KERFU2VxMikKbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKIyMgTG9hZCBkYXRhCgpgYGB7cn0KcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50c19wYXRoID0gImRhdGEvYmF0Y2hfY29ycmVjdGVkX2RhdGEvcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50cy50c3YiCm1ldGFkYXRhX3BhdGggPSAiZGF0YS9mb3JtYXR0ZWRfcm5hc2VxX2RhdGEvbWV0YWRhdGEudHN2IgpweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHNfcGF0aCA9ICJkYXRhL2RpZmZlcmVudGlhbF9hbmFseXNpcy9weWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHMudHN2IgpweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3NlbGVjdGVkX2dlbmVzX3BhdGggPSAiZGF0YS9kaWZmZXJlbnRpYWxfYW5hbHlzaXMvcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2RleF9zZWxlY3RlZF9nZW5lcy50c3YiCmBgYAoKYGBge3J9CnB5Y29tYmF0X2JhdGNoX2NvcnJlY3RlZF9jb3VudHMgPSBhcy5kYXRhLmZyYW1lKHJlYWQudGFibGUoCiAgcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50c19wYXRoLAogIHNlcCA9ICJcdCIsCiAgaGVhZGVyID0gVFJVRSwKICByb3cubmFtZXMgPSAxCikpCgpwcmludChweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfY291bnRzKQoKbWV0YWRhdGEgPSBhcy5kYXRhLmZyYW1lKHJlYWQudGFibGUoCiAgbWV0YWRhdGFfcGF0aCwKICBzZXAgPSAiXHQiLAogIGhlYWRlciA9IFRSVUUsCiAgcm93Lm5hbWVzID0gMQopKQoKcHJpbnQobWV0YWRhdGEpCmBgYAoKIyMgREVTZXEKCiMjIyBTZXR1cCBkZHMKCmBgYHtyfQpkZXNpZ24gPSB+IHR5cGUgKyBkYXRhc2V0X25hbWUKCmRkcyA9IERFU2VxRGF0YVNldEZyb21NYXRyaXgoCiAgY291bnREYXRhID0gcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50cywKICBjb2xEYXRhID0gbWV0YWRhdGEsCiAgZGVzaWduID0gZGVzaWduKQpgYGAKCiMjIyBSdW4gZGVzZXEKCmBgYHtyfQpkZHMgPC0gREVTZXEoZGRzLCB0ZXN0PSJMUlQiLCByZWR1Y2VkID0gfiBkYXRhc2V0X25hbWUpCnNhdmUoZGRzLCBmaWxlID0gcGFzdGUwKCJkYXRhL2RpZmZlcmVudGlhbF9hbmFseXNpcy9weWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfY291bnRzX2Rkcy5SRGF0YSIpKQpgYGAKIyMjIEV4YW1pbmUgcmVzdWx0cwoKYGBge3J9CnJlcyA9IGFzLmRhdGEuZnJhbWUocmVzdWx0cyhkZHMpKQoKIyBGb3JtYXQgcmVzdWx0cwpyZXMgPSBhcy5kYXRhLmZyYW1lKHJlcykKcmVzID0gcmVzICU+JSAKICByZWxvY2F0ZShwYWRqKSAlPiUKICByZWxvY2F0ZShwdmFsdWUpCgpwcmludChyZXMpCgpyZXNfcGF0aCA9IHBhc3RlMChweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHNfcGF0aCkKd3JpdGUudGFibGUocmVzLCBmaWxlID0gcmVzX3BhdGgsIHNlcCA9ICJcdCIpCmBgYAojIyBTZWxlY3QgZmVhdHVyZXMKClZpc3VhbGl6ZSBzcHJlYWQgb2YgbDJmYyB2YWx1ZXMgdG8gZGV0ZXJtaW5lIGEgc2Vuc2libGUgY3V0b2ZmIGZvciBmZWF0dXJlIHNlbGVjdGlvbi4gTm90ZSB0aGF0IGwyZmMgZG9lcyBub3QgaGF2ZSBhIGRpcmVjdCBtZWFuaW5nIHNpbmNlIHdlJ3JlIGRvaW5nIGFuIExSVCB0ZXN0LgoKYGBge3J9CiMgQ2FsY3VsYXRlIHRoZSBjdW11bGF0aXZlIGNvdW50CnJlcyRhYnNfbDJmYyA9IGFicyhyZXMkbG9nMkZvbGRDaGFuZ2UpCnNvcnRlZF9hYnNfbDJmYyA8LSBzb3J0KHJlcyRhYnNfbDJmYykKY3VtdWxhdGl2ZV9jb3VudHMgPC0gY3Vtc3VtKHRhYmxlKHNvcnRlZF9hYnNfbDJmYykpCgojIFBsb3QgdGhlIGN1bXVsYXRpdmUgY291bnRzCnBsb3QobmFtZXMoY3VtdWxhdGl2ZV9jb3VudHMpLCBjdW11bGF0aXZlX2NvdW50cywgCiAgICAgdHlwZT0icyIsICAjIFN0ZXAgcGxvdAogICAgIG1haW49IkN1bXVsYXRpdmUgQ291bnQgUGxvdCBmb3IgTDJGQyBvZiBnZW5lcyIsCiAgICAgeGxhYj0iTDJGQyBWYWx1ZSIsIAogICAgIHlsYWI9IkN1bXVsYXRpdmUgQ291bnQiLAogICAgIGNvbD0iYmx1ZSIpCgpjdXRvZmYgPSAyICMgQ2hvc2VuIGZyb20gbG9va2luZyBhdCB0aGUgcGxvdAojIEFkZCBhIGhvcml6b250YWwgbGluZSBhdCBjdXRvZmYKYWJsaW5lKHYgPSBjdXRvZmYsIGNvbD0icmVkIiwgbHR5PTIpCgpzZWxlY3RlZF9nZW5lcyA9IHJlc1tyZXMkYWJzX2wyZmMgPiBjdXRvZmYsXQpwcmludChzZWxlY3RlZF9nZW5lcykKCnNlbGVjdGVkX2dlbmVzX3BhdGggPSBwYXN0ZTAocHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2RleF9zZWxlY3RlZF9nZW5lc19wYXRoKQp3cml0ZS50YWJsZShzZWxlY3RlZF9nZW5lcywgZmlsZSA9IHNlbGVjdGVkX2dlbmVzX3BhdGgsIHNlcCA9ICJcdCIpCgoKIyAjIElkZW50aWZ5IGluZmxlY3Rpb24gcG9pbnQKIyB4X3ZhbHVlcyA8LSBhcy5udW1lcmljKG5hbWVzKGN1bXVsYXRpdmVfY291bnRzKSkKIyB5X3ZhbHVlcyA8LSBhcy5udW1lcmljKGN1bXVsYXRpdmVfY291bnRzKQojIAojICMgQXBwcm94aW1hdGUgZmlyc3QgYW5kIHNlY29uZCBkZXJpdmF0aXZlcwojIGZpcnN0X2Rlcml2YXRpdmUgPC0gZGlmZih5X3ZhbHVlcykKIyBzZWNvbmRfZGVyaXZhdGl2ZSA8LSBkaWZmKGZpcnN0X2Rlcml2YXRpdmUpCiMgCiMgaW5mbGVjdGlvbl9wb2ludF9pbmRleCA8LSB3aGljaChkaWZmKHNpZ24oc2Vjb25kX2Rlcml2YXRpdmUpKSAhPSAwKSArIDEKIyBpbmZsZWN0aW9uX3BvaW50IDwtIHhfdmFsdWVzW2luZmxlY3Rpb25fcG9pbnRfaW5kZXhdCiMgCiMgIyBBZGQgdGhlIGluZmxlY3Rpb24gcG9pbnQgdG8gdGhlIHBsb3QKIyBwb2ludHMoaW5mbGVjdGlvbl9wb2ludCwgeV92YWx1ZXNbaW5mbGVjdGlvbl9wb2ludF9pbmRleF0sIGNvbD0icmVkIiwgcGNoPTE5KQojIGFibGluZSh2ID0gaW5mbGVjdGlvbl9wb2ludCwgY29sPSJyZWQiLCBsdHk9MikKIyAKIyAjIFByaW50IHRoZSBpbmZsZWN0aW9uIHBvaW50CiMgcHJpbnQocGFzdGUoIkluZmxlY3Rpb24gcG9pbnQgYXQgeCA9IiwgaW5mbGVjdGlvbl9wb2ludCkpCgpgYGAKCg==